trips_per_day <- read_tsv("trips_per_day.tsv")
set.seed(42)
# sample split using caTools package
sample <- sample.split(trips_per_day$num_trips, SplitRatio = 0.8)
train_data <- subset(trips_per_day, sample== T)
other_data <- subset(trips_per_day, sample== F)
sample2 <- sample.split(other_data$num_trips, SplitRatio = 0.5)
validate_data <- subset(other_data, sample2== T)
test_data <- subset(other_data, sample2== F)
# Check if the split works
nrow(trips_per_day)==sum(nrow(train_data), nrow(test_data), nrow(validate_data))
## [1] TRUE
K <- 1:8
train_err <- c()
validate_err <- c()
for (k in K){
# fit on the training data
model <- lm(num_trips ~ poly(tmin, k, raw = T), train_data)
# evaluate on the training data
# RMSE-> root mean square error. sqrt( (predict-truth)^2 /number of data )
train_err[k] <- sqrt(mean((predict(model, train_data)-train_data$num_trips)^2))
validate_err[k] <- sqrt(mean((predict(model, validate_data ) - validate_data$num_trips)^2))
}
plot_data <- data.frame(K, train_err, validate_err) %>%
gather("split","error",-K)
ggplot(plot_data, aes(x=K, y = error, color = split)) +
geom_line() +
scale_x_continuous(breaks = K) +
xlab('Polynomial Degrees') +
ylab('RMSE')
train_data <- train_data %>%
mutate(weekday = weekdays(as.POSIXct(ymd), abbreviate = T))
validate_data<- validate_data %>%
mutate(weekday = weekdays(as.POSIXct(ymd), abbreviate = T))
#Check if weekdays increases R^2 for the value
model <- lm(num_trips ~. , train_data)
glance(model)
major_holidays <- c("0101", "0120", "0214", "0217", "0526", "0704", "0901", "1013", "1127", "1224", "1225","1231")
train_data <- train_data %>%
mutate(month_day = format(train_data$ymd,"%m%d"),
is_holiday = month_day %in% major_holidays) %>%
select(-month_day)
validate_data <- validate_data %>%
mutate(month_day = format(validate_data$ymd,"%m%d"),
is_holiday = month_day %in% major_holidays) %>%
select(-month_day)
#Check if is_holiday increases R^2 for the value
model <- lm(num_trips ~. , train_data)
glance(model)
flu_season <- c("12", "01", "02")
train_data <- train_data %>%
mutate(month = format(ymd,"%m"),
is_flu_season = month %in% flu_season) %>%
select(-month)
validate_data <- validate_data %>% mutate(month = format(ymd,"%m"),
is_flu_season = month %in% flu_season) %>%
select(-month)
model <- lm(num_trips ~. -ymd -date -snow, train_data)
glance(model)
# fit a model for each polynomial degree consider 7 predictors
K <- 1:8
train_err <- c()
validate_err <- c()
for (k in K){
# fit on the training data
model <- lm(num_trips ~ poly(prcp, k, raw = T) + poly(snwd, k, raw = T) + poly(tmax, k, raw = T) + poly(tmin, k, raw = T) + weekday + is_holiday + is_flu_season, train_data)
# evaluate on the training data
# RMSE-> root mean square error. sqrt( (predict-truth)^2 /number of data )
train_err[k] <- sqrt(mean((predict(model, train_data)-train_data$num_trips)^2))
validate_err[k] <- sqrt(mean((predict(model, validate_data ) - validate_data$num_trips)^2))
}
plot_data <- data.frame(K, train_err, validate_err) %>%
gather("split","error",-K)
ggplot(plot_data, aes(x=K, y = error, color = split)) +
geom_line() +
scale_x_continuous(breaks = K) +
xlab('Polynomial Degrees') +
ylab('RMSE')
Polynomial of Degree two gives lowest validation error. Further investigation still nedded.
# Everything at degree 2
model <- lm(num_trips ~ poly(prcp, 2, raw = T)+poly(snwd, 2, raw = T)+poly(tmax, 2, raw = T)+poly(tmin, 2, raw = T) + weekday + is_holiday + is_flu_season, train_data)
glance(model)
# take out insignificant predictors
model <- lm(num_trips ~ poly(prcp, 2, raw = T)+ snwd + poly(tmax, 2, raw = T)+poly(tmin, 2, raw = T) + weekday + is_holiday + is_flu_season, train_data)
# check R^2
rsquare(model, train_data)
## [1] 0.898932
rsquare(model, validate_data)
## [1] 0.9154361
# take out more insignificant predictors
model <- lm(num_trips ~ poly(prcp, 2, raw = T)+ snwd + tmax + tmin + weekday + is_holiday + is_flu_season, train_data)
#Check rmse and R^2
rmse(model, train_data)
## [1] 3180.246
rmse(model, validate_data)
## [1] 3116.522
rsquare(model, train_data)
## [1] 0.8976948
rsquare(model, validate_data)
## [1] 0.9110483
# Best Model
model <- lm(num_trips ~ poly(prcp, 2, raw = T)+ snwd + tmax + tmin + weekday + is_holiday + is_flu_season, train_data)# 0.8976948
rsquare(model, train_data)
## [1] 0.8976948
tidy(model)
summary(model)
##
## Call:
## lm(formula = num_trips ~ poly(prcp, 2, raw = T) + snwd + tmax +
## tmin + weekday + is_holiday + is_flu_season, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9662.5 -2189.5 306.7 2082.8 7917.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1802.19 1266.18 -1.423 0.15576
## poly(prcp, 2, raw = T)1 -17238.39 1451.91 -11.873 < 2e-16 ***
## poly(prcp, 2, raw = T)2 5723.47 876.09 6.533 3.06e-10 ***
## snwd -186.52 80.73 -2.310 0.02159 *
## tmax 2901.74 426.44 6.805 6.22e-11 ***
## tmin 1164.51 441.58 2.637 0.00883 **
## weekdayMon 4643.40 757.77 6.128 3.04e-09 ***
## weekdaySat -1333.46 753.18 -1.770 0.07775 .
## weekdaySun 4178.14 738.97 5.654 3.89e-08 ***
## weekdayThu 5190.18 746.20 6.955 2.51e-11 ***
## weekdayTue 5603.25 750.92 7.462 1.10e-12 ***
## weekdayWed 5626.15 760.10 7.402 1.60e-12 ***
## is_holidayTRUE -7766.91 1226.65 -6.332 9.70e-10 ***
## is_flu_seasonTRUE -3654.38 648.94 -5.631 4.37e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3259 on 278 degrees of freedom
## Multiple R-squared: 0.8977, Adjusted R-squared: 0.8929
## F-statistic: 187.6 on 13 and 278 DF, p-value: < 2.2e-16
# test the model by combine train_data with validate_data
com_data <- rbind(train_data, validate_data)
rsquare(model, com_data)
## [1] 0.8996773
plot_data <- validate_data %>%
add_predictions(model)
ggplotly(ggplot(plot_data, aes(x= ymd, y = pred))+
geom_point(aes(y= num_trips))+
geom_line(aes(y=pred), color = "red")+
geom_point(aes(y=pred), color = "red") +
geom_smooth() +
xlab("Date") +
ylab("Predicted (in red)/ Actual (in black)")+
ggtitle("Number of trips at different dates"))
ggplot(plot_data, aes(x=pred, y =num_trips ))+
geom_point()+
geom_abline(linetype = "dashed") +
xlab('Predicted') +
ylab('Actual')
test_data <- test_data %>%
mutate(weekday = weekdays(as.POSIXct(ymd), abbreviate = T),
month_day = format(ymd,"%m%d"),
is_holiday = month_day %in% major_holidays,
month = format(ymd,"%m"),
is_flu_season = month %in% flu_season) %>%
select(-month,-month_day)
summary(model)
##
## Call:
## lm(formula = num_trips ~ poly(prcp, 2, raw = T) + snwd + tmax +
## tmin + weekday + is_holiday + is_flu_season, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9662.5 -2189.5 306.7 2082.8 7917.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1802.19 1266.18 -1.423 0.15576
## poly(prcp, 2, raw = T)1 -17238.39 1451.91 -11.873 < 2e-16 ***
## poly(prcp, 2, raw = T)2 5723.47 876.09 6.533 3.06e-10 ***
## snwd -186.52 80.73 -2.310 0.02159 *
## tmax 2901.74 426.44 6.805 6.22e-11 ***
## tmin 1164.51 441.58 2.637 0.00883 **
## weekdayMon 4643.40 757.77 6.128 3.04e-09 ***
## weekdaySat -1333.46 753.18 -1.770 0.07775 .
## weekdaySun 4178.14 738.97 5.654 3.89e-08 ***
## weekdayThu 5190.18 746.20 6.955 2.51e-11 ***
## weekdayTue 5603.25 750.92 7.462 1.10e-12 ***
## weekdayWed 5626.15 760.10 7.402 1.60e-12 ***
## is_holidayTRUE -7766.91 1226.65 -6.332 9.70e-10 ***
## is_flu_seasonTRUE -3654.38 648.94 -5.631 4.37e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3259 on 278 degrees of freedom
## Multiple R-squared: 0.8977, Adjusted R-squared: 0.8929
## F-statistic: 187.6 on 13 and 278 DF, p-value: < 2.2e-16
rmse(model, test_data)
## [1] 12808.13
# negative R^2 !!
rsquare(model, test_data)
## [1] -0.2237912
test_data %>% View
# removed the outlier
plot_data <- test_data %>%
add_predictions(model) %>%
filter(ymd!="2014-04-30")
#Small error R^2 = 0.95
rmse(model, plot_data)
## [1] 2460.244
rsquare(model, plot_data)
## [1] 0.9524131
ggplotly(ggplot(plot_data, aes(x= ymd, y = pred))+
geom_point(aes(y= num_trips))+
geom_line(aes(y=pred), color = "red")+
geom_point(aes(y=pred), color = "red") +
geom_smooth() +
xlab("Date") +
ylab("Predicted (in red)/ Actual (in black)")+
ggtitle("Number of trips at different dates"))